{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-117-5656dbe3f20c>:26: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "# 准备工作\n",
    "# 导入所需模块\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random\n",
    "from requests_html import HTMLSession\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')#解决DevToolsActivePort文件不存在的报错\n",
    "opts.add_argument('window-size=1920x3000') #指定浏览器分辨率\n",
    "opts.add_argument('--disable-gpu') #谷歌文档提到需要加上一这个属性来规避bug\n",
    "opts.add_argument('--hide-scrollbars') #隐藏滚动条, 应对些特殊页面\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度\n",
    "#opts.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败\n",
    "# opts.binary_location = \"C:\\portable\\PortableApps\\IronPortable\\App\\Iron\\chrome.exe\"\n",
    "# opts.binary_location = \"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe\" #\"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "\n",
    "\n",
    "driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输入链接 进入知网\n",
    "driver.get(\"https://www.cnki.net/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'中山大学南...'"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 检查是否是 “中山大学南方学院”登录\n",
    "driver.find_element_by_id('Ecp_loginShowName1').get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击高级检索\n",
    "element = driver.find_element_by_id('highSearch').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-63B2B96DA7167A1773DB538806D44ED6'"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 当前窗口ID\n",
    "driver.current_window_handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-63B2B96DA7167A1773DB538806D44ED6',\n",
       " 'CDwindow-621DC4615BF2C904A2945859EF503199']"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 所有窗口ID\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-123-0188c2a7ff70>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 学术期刊（分类可自选）\n",
    "element = driver.find_element_by_xpath('//li[@data-id=\"xsqk\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击专业检索\n",
    "element = driver.find_element_by_name('majorSearch')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "# all\n",
    "element = driver.find_element_by_xpath('//input[@name=\"all\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 设置query\n",
    "query = 'SU = \"舞弊\" AND  (TI =\"乐视网\" OR  TI =\"建议\"  OR TI = \"financial\" OR TI = \"fraud\")'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输入关键词\n",
    "element = driver.find_element_by_xpath('//textarea')\n",
    "element.clear()\n",
    "element.send_keys(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击检索\n",
    "element = driver.find_element_by_xpath('//input[@value=\"检索\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'575'"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 检索文章的总数量\n",
    "element = driver.find_element_by_xpath('//*[@id=\"countPageDiv\"]/span[1]/em')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 更换页面文章数量\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-sort\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 更换页面文章数量 ——> 每页50\n",
    "element = driver.find_element_by_xpath('//div[@id=\"perPageDiv\"]//li[@data-val=\"50\"]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1/12'"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 检索论文内容一共有多少页？\n",
    "element = driver.find_element_by_xpath('//span[@class=\"countPageMark\"]')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>财务欺诈风险特征筛选框架的建立和应用  网络首发</td>\n",
       "      <td>袁先智;周云鹏;严诚幸;刘海洋;钱国骐</td>\n",
       "      <td>中国管理科学</td>\n",
       "      <td>2021-05-11 09:35</td>\n",
       "      <td>NaN</td>\n",
       "      <td>270.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>中小企业会计舞弊成因及对策探讨</td>\n",
       "      <td>卢增金</td>\n",
       "      <td>中国商论</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>180.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>央企财务收支审计高风险领域及对策研究——基于审计署对35户央企2016年度财务审计结果公告的分析</td>\n",
       "      <td>王细韵</td>\n",
       "      <td>经济研究导刊</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>财务舞弊预防对策及建议探讨</td>\n",
       "      <td>陈利</td>\n",
       "      <td>会计师</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>84.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>基于舞弊三角理论的瑞幸咖啡财务造假案例研究</td>\n",
       "      <td>张潇丹</td>\n",
       "      <td>中国林业经济</td>\n",
       "      <td>2021-03-26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3539.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>基于GONE理论视角下上市公司财务舞弊的解决措施和策略</td>\n",
       "      <td>陈朝骞</td>\n",
       "      <td>中国商论</td>\n",
       "      <td>2021-03-23</td>\n",
       "      <td>1.0</td>\n",
       "      <td>767.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>财务舞弊事件：手段、原因与启示——以金亚科技为例</td>\n",
       "      <td>李维</td>\n",
       "      <td>中国商论</td>\n",
       "      <td>2021-03-12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1198.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>交易特征会影响上市公司的财务舞弊行为吗？——基于资产专用性的视角</td>\n",
       "      <td>王小语; 林冬冬</td>\n",
       "      <td>商展经济</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>58.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>基于GONE理论的农业上市公司财务舞弊案例研究——以獐子岛为例</td>\n",
       "      <td>姚正海; 张琳若</td>\n",
       "      <td>财务管理研究</td>\n",
       "      <td>2021-02-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>766.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>财务报表分析技术在财务舞弊甄别中的应用</td>\n",
       "      <td>俞雅萍</td>\n",
       "      <td>财务管理研究</td>\n",
       "      <td>2021-02-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>211.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>通过MBTI人格类型量表分析减少会计从业人员舞弊的方法</td>\n",
       "      <td>李珊珊</td>\n",
       "      <td>江苏商论</td>\n",
       "      <td>2021-02-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>119.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>瑞幸咖啡财务舞弊事件分析——基于CRIME模型</td>\n",
       "      <td>孙辉; 章辉</td>\n",
       "      <td>生产力研究</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>416.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>基于舞弊三角论的财务造假问题分析——以抚顺特钢为例</td>\n",
       "      <td>白云; 李治堂; 张颖</td>\n",
       "      <td>中国商论</td>\n",
       "      <td>2021-02-07</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1132.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>基于管理会计的瑞幸咖啡公司财务舞弊研究</td>\n",
       "      <td>曾耀锐</td>\n",
       "      <td>中国商论</td>\n",
       "      <td>2021-02-07</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2164.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>公司治理结构与财务舞弊的关系实证研究——基于法务会计视角</td>\n",
       "      <td>刘桔林; 陈美芳</td>\n",
       "      <td>湖南财政经济学院学报</td>\n",
       "      <td>2021-01-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>608.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>公司财务舞弊的智能识别与模型优化策略</td>\n",
       "      <td>曾小青; 唐湘勇</td>\n",
       "      <td>长沙理工大学学报(社会科学版)</td>\n",
       "      <td>2021-01-27</td>\n",
       "      <td>NaN</td>\n",
       "      <td>435.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>平台型媒体中的商业舞弊治理研究——基于生命周期视角</td>\n",
       "      <td>张琦; 易开刚; 古家军</td>\n",
       "      <td>财经理论与实践</td>\n",
       "      <td>2021-01-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>110.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>基于GONE理论的LK公司舞弊动因分析与审计风险防范</td>\n",
       "      <td>陈嘉伟; 邱杰</td>\n",
       "      <td>当代会计</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>81.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>财务舞弊、供应链集中度与企业商业信用融资</td>\n",
       "      <td>修宗峰; 刘然; 殷敬伟</td>\n",
       "      <td>会计研究</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>729.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>Deutsche Bank Agrees to Pay over $130 Million ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Department of Justice (DOJ) Documents / FIND</td>\n",
       "      <td>2021-01-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>The political demise of Metiria Turei: “fraud,...</td>\n",
       "      <td>Gray Claire</td>\n",
       "      <td>Feminist Media Studies</td>\n",
       "      <td>2021-01-02</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>从瑞幸事件看财务舞弊、做空策略与审计防范</td>\n",
       "      <td>汤伟; 陈树; 王会芹</td>\n",
       "      <td>陕西广播电视大学学报</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>695.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>大数据技术在银行反舞弊审计中的应用——基于J银行审计实证研究</td>\n",
       "      <td>尹蕾</td>\n",
       "      <td>市场周刊</td>\n",
       "      <td>2020-12-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>310.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>社会责任信息披露与财务舞弊关系实证研究</td>\n",
       "      <td>任海芝; 刘雪; 张瑞雪</td>\n",
       "      <td>科技促进发展</td>\n",
       "      <td>2020-11-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>228.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>基于玻尔原子模型的财务舞弊动因研究——以瑞幸咖啡为例</td>\n",
       "      <td>宋文卿; 卿松</td>\n",
       "      <td>财务管理研究</td>\n",
       "      <td>2020-11-20</td>\n",
       "      <td>1.0</td>\n",
       "      <td>870.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>基于机器学习的上市公司财报舞弊识别前沿方法比较研究</td>\n",
       "      <td>黄志刚; 刘佳进; 林朝颖</td>\n",
       "      <td>系统科学与数学</td>\n",
       "      <td>2020-10-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>295.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>会计电算化环境下会计舞弊行为及对策研究——以JA公司为例</td>\n",
       "      <td>姚倩</td>\n",
       "      <td>湖南工业职业技术学院学报</td>\n",
       "      <td>2020-10-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>249.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>K公司财务舞弊动因与防范——基于GONE理论视角</td>\n",
       "      <td>刘丽萍; 赵任昊</td>\n",
       "      <td>市场周刊</td>\n",
       "      <td>2020-10-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>482.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>大数据时代对上市公司财务舞弊的影响——研究综述及展望</td>\n",
       "      <td>张力派; 程晨; 陈玲玲</td>\n",
       "      <td>管理现代化</td>\n",
       "      <td>2020-09-28 11:19</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3478.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>商业银行互联网线上信贷业务反欺诈与舞弊内部审计——基于部分典型案例的分析与思考</td>\n",
       "      <td>詹向勇</td>\n",
       "      <td>中国内部审计</td>\n",
       "      <td>2020-09-15</td>\n",
       "      <td>2.0</td>\n",
       "      <td>663.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>组织认同与亲组织财务报告舞弊决策——多重中介效应分析</td>\n",
       "      <td>陈邑早; 张莹; 孔晨</td>\n",
       "      <td>经济管理</td>\n",
       "      <td>2020-09-14 13:21</td>\n",
       "      <td>1.0</td>\n",
       "      <td>742.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>医疗保险反欺诈预警机制设计研究</td>\n",
       "      <td>牛秀粉</td>\n",
       "      <td>财会通讯</td>\n",
       "      <td>2020-09-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>280.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>我国林业上市公司会计舞弊动因与治理研究</td>\n",
       "      <td>尹梦瑶; 李登明</td>\n",
       "      <td>中国林业经济</td>\n",
       "      <td>2020-08-13</td>\n",
       "      <td>3.0</td>\n",
       "      <td>257.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>研发投资、舞弊风险与审计费用</td>\n",
       "      <td>马广奇; 张保平; 沈李欢</td>\n",
       "      <td>南京审计大学学报</td>\n",
       "      <td>2020-07-17 13:22</td>\n",
       "      <td>3.0</td>\n",
       "      <td>857.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>区块链技术下的医院财务管理平台建设</td>\n",
       "      <td>唐衍军; 黄益; 蒋翠珍</td>\n",
       "      <td>卫生经济研究</td>\n",
       "      <td>2020-07-02 14:03</td>\n",
       "      <td>3.0</td>\n",
       "      <td>498.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>区块链技术下会计舞弊问题探究</td>\n",
       "      <td>王珂</td>\n",
       "      <td>对外经贸</td>\n",
       "      <td>2020-06-29</td>\n",
       "      <td>1.0</td>\n",
       "      <td>336.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>风险导向社会工作服务项目的财务评估</td>\n",
       "      <td>温欣</td>\n",
       "      <td>山东工商学院学报</td>\n",
       "      <td>2020-06-15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>218.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>基于区块链技术下上市企业舞弊审计的研究</td>\n",
       "      <td>谢佩君; 陈恒; 曹奕</td>\n",
       "      <td>湖南财政经济学院学报</td>\n",
       "      <td>2020-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>231.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>大学生舞弊现状调查分析——以A高校为例</td>\n",
       "      <td>周嘉琳; 曾诗棋; 李洪毅</td>\n",
       "      <td>科教导刊(中旬刊)</td>\n",
       "      <td>2020-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>基于GONE理论的欢瑞影视财务舞弊问题研究</td>\n",
       "      <td>韦懿桐</td>\n",
       "      <td>价值工程</td>\n",
       "      <td>2020-06-08</td>\n",
       "      <td>2.0</td>\n",
       "      <td>633.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>上市公司财务舞弊的成因与治理研究——以瑞幸咖啡公司为例</td>\n",
       "      <td>郑丽萍; 赵杨</td>\n",
       "      <td>管理现代化</td>\n",
       "      <td>2020-06-04 09:59</td>\n",
       "      <td>12.0</td>\n",
       "      <td>12566.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>管理层业绩目标、内部控制有效性与财务舞弊</td>\n",
       "      <td>余思明; 唐建新; 孙辉东</td>\n",
       "      <td>预测</td>\n",
       "      <td>2020-05-28 13:36</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1519.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>内部人举报制度、舞弊风险与审计定价</td>\n",
       "      <td>汶海; 李培功</td>\n",
       "      <td>审计研究</td>\n",
       "      <td>2020-05-28</td>\n",
       "      <td>1.0</td>\n",
       "      <td>799.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>抗战胜利前后国民政府惩治贪腐探析——以黄金提价舞弊案为中心</td>\n",
       "      <td>辜雅</td>\n",
       "      <td>民国档案</td>\n",
       "      <td>2020-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>105.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>基于GONE理论的*ST昆机财务舞弊动因分析</td>\n",
       "      <td>张继德; 姜园园</td>\n",
       "      <td>财务管理研究</td>\n",
       "      <td>2020-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>607.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>风险导向审计模式在管理舞弊审计中的运用</td>\n",
       "      <td>王晓茜; 徐志耀</td>\n",
       "      <td>江苏商论</td>\n",
       "      <td>2020-05-20</td>\n",
       "      <td>1.0</td>\n",
       "      <td>484.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>瑞幸咖啡财务舞弊案例分析</td>\n",
       "      <td>黄佳琦; 宋夏云</td>\n",
       "      <td>财务管理研究</td>\n",
       "      <td>2020-05-20</td>\n",
       "      <td>19.0</td>\n",
       "      <td>14348.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>稀疏组Lasso-logistic回归模型在财务报告舞弊识别中的应用研究</td>\n",
       "      <td>王威</td>\n",
       "      <td>数学的实践与认识</td>\n",
       "      <td>2020-05-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>333.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>“高存高贷”下的公司财务舞弊——以康美药业财务舞弊为例</td>\n",
       "      <td>赵春艳</td>\n",
       "      <td>郑州航空工业管理学院学报</td>\n",
       "      <td>2020-04-26</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1955.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>Losing body weight for money: How provider-sid...</td>\n",
       "      <td>Hochuli Philip</td>\n",
       "      <td>Health economics</td>\n",
       "      <td>2020-04-17</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                                 篇名  \\\n",
       "0            1                           财务欺诈风险特征筛选框架的建立和应用  网络首发   \n",
       "1            2                                    中小企业会计舞弊成因及对策探讨   \n",
       "2            3   央企财务收支审计高风险领域及对策研究——基于审计署对35户央企2016年度财务审计结果公告的分析   \n",
       "3            4                                      财务舞弊预防对策及建议探讨   \n",
       "4            5                              基于舞弊三角理论的瑞幸咖啡财务造假案例研究   \n",
       "5            6                        基于GONE理论视角下上市公司财务舞弊的解决措施和策略   \n",
       "6            7                           财务舞弊事件：手段、原因与启示——以金亚科技为例   \n",
       "7            8                   交易特征会影响上市公司的财务舞弊行为吗？——基于资产专用性的视角   \n",
       "8            9                    基于GONE理论的农业上市公司财务舞弊案例研究——以獐子岛为例   \n",
       "9           10                                财务报表分析技术在财务舞弊甄别中的应用   \n",
       "10          11                        通过MBTI人格类型量表分析减少会计从业人员舞弊的方法   \n",
       "11          12                            瑞幸咖啡财务舞弊事件分析——基于CRIME模型   \n",
       "12          13                          基于舞弊三角论的财务造假问题分析——以抚顺特钢为例   \n",
       "13          14                                基于管理会计的瑞幸咖啡公司财务舞弊研究   \n",
       "14          15                       公司治理结构与财务舞弊的关系实证研究——基于法务会计视角   \n",
       "15          16                                 公司财务舞弊的智能识别与模型优化策略   \n",
       "16          17                          平台型媒体中的商业舞弊治理研究——基于生命周期视角   \n",
       "17          18                         基于GONE理论的LK公司舞弊动因分析与审计风险防范   \n",
       "18          19                               财务舞弊、供应链集中度与企业商业信用融资   \n",
       "19          20  Deutsche Bank Agrees to Pay over $130 Million ...   \n",
       "20          21  The political demise of Metiria Turei: “fraud,...   \n",
       "21          22                               从瑞幸事件看财务舞弊、做空策略与审计防范   \n",
       "22          23                     大数据技术在银行反舞弊审计中的应用——基于J银行审计实证研究   \n",
       "23          24                                社会责任信息披露与财务舞弊关系实证研究   \n",
       "24          25                         基于玻尔原子模型的财务舞弊动因研究——以瑞幸咖啡为例   \n",
       "25          26                          基于机器学习的上市公司财报舞弊识别前沿方法比较研究   \n",
       "26          27                       会计电算化环境下会计舞弊行为及对策研究——以JA公司为例   \n",
       "27          28                           K公司财务舞弊动因与防范——基于GONE理论视角   \n",
       "28          29                         大数据时代对上市公司财务舞弊的影响——研究综述及展望   \n",
       "29          30            商业银行互联网线上信贷业务反欺诈与舞弊内部审计——基于部分典型案例的分析与思考   \n",
       "30          31                         组织认同与亲组织财务报告舞弊决策——多重中介效应分析   \n",
       "31          32                                    医疗保险反欺诈预警机制设计研究   \n",
       "32          33                                我国林业上市公司会计舞弊动因与治理研究   \n",
       "33          34                                     研发投资、舞弊风险与审计费用   \n",
       "34          35                                  区块链技术下的医院财务管理平台建设   \n",
       "35          36                                     区块链技术下会计舞弊问题探究   \n",
       "36          37                                  风险导向社会工作服务项目的财务评估   \n",
       "37          38                                基于区块链技术下上市企业舞弊审计的研究   \n",
       "38          39                                大学生舞弊现状调查分析——以A高校为例   \n",
       "39          40                              基于GONE理论的欢瑞影视财务舞弊问题研究   \n",
       "40          41                        上市公司财务舞弊的成因与治理研究——以瑞幸咖啡公司为例   \n",
       "41          42                               管理层业绩目标、内部控制有效性与财务舞弊   \n",
       "42          43                                  内部人举报制度、舞弊风险与审计定价   \n",
       "43          44                      抗战胜利前后国民政府惩治贪腐探析——以黄金提价舞弊案为中心   \n",
       "44          45                             基于GONE理论的*ST昆机财务舞弊动因分析   \n",
       "45          46                                风险导向审计模式在管理舞弊审计中的运用   \n",
       "46          47                                       瑞幸咖啡财务舞弊案例分析   \n",
       "47          48               稀疏组Lasso-logistic回归模型在财务报告舞弊识别中的应用研究   \n",
       "48          49                        “高存高贷”下的公司财务舞弊——以康美药业财务舞弊为例   \n",
       "49          50  Losing body weight for money: How provider-sid...   \n",
       "\n",
       "                     作者                                            刊名  \\\n",
       "0   袁先智;周云鹏;严诚幸;刘海洋;钱国骐                                        中国管理科学   \n",
       "1                   卢增金                                          中国商论   \n",
       "2                   王细韵                                        经济研究导刊   \n",
       "3                    陈利                                           会计师   \n",
       "4                   张潇丹                                        中国林业经济   \n",
       "5                   陈朝骞                                          中国商论   \n",
       "6                    李维                                          中国商论   \n",
       "7              王小语; 林冬冬                                          商展经济   \n",
       "8              姚正海; 张琳若                                        财务管理研究   \n",
       "9                   俞雅萍                                        财务管理研究   \n",
       "10                  李珊珊                                          江苏商论   \n",
       "11               孙辉; 章辉                                         生产力研究   \n",
       "12          白云; 李治堂; 张颖                                          中国商论   \n",
       "13                  曾耀锐                                          中国商论   \n",
       "14             刘桔林; 陈美芳                                    湖南财政经济学院学报   \n",
       "15             曾小青; 唐湘勇                               长沙理工大学学报(社会科学版)   \n",
       "16         张琦; 易开刚; 古家军                                       财经理论与实践   \n",
       "17              陈嘉伟; 邱杰                                          当代会计   \n",
       "18         修宗峰; 刘然; 殷敬伟                                          会计研究   \n",
       "19                  NaN  Department of Justice (DOJ) Documents / FIND   \n",
       "20          Gray Claire                        Feminist Media Studies   \n",
       "21          汤伟; 陈树; 王会芹                                    陕西广播电视大学学报   \n",
       "22                   尹蕾                                          市场周刊   \n",
       "23         任海芝; 刘雪; 张瑞雪                                        科技促进发展   \n",
       "24              宋文卿; 卿松                                        财务管理研究   \n",
       "25        黄志刚; 刘佳进; 林朝颖                                       系统科学与数学   \n",
       "26                   姚倩                                  湖南工业职业技术学院学报   \n",
       "27             刘丽萍; 赵任昊                                          市场周刊   \n",
       "28         张力派; 程晨; 陈玲玲                                         管理现代化   \n",
       "29                  詹向勇                                        中国内部审计   \n",
       "30          陈邑早; 张莹; 孔晨                                          经济管理   \n",
       "31                  牛秀粉                                          财会通讯   \n",
       "32             尹梦瑶; 李登明                                        中国林业经济   \n",
       "33        马广奇; 张保平; 沈李欢                                      南京审计大学学报   \n",
       "34         唐衍军; 黄益; 蒋翠珍                                        卫生经济研究   \n",
       "35                   王珂                                          对外经贸   \n",
       "36                   温欣                                      山东工商学院学报   \n",
       "37          谢佩君; 陈恒; 曹奕                                    湖南财政经济学院学报   \n",
       "38        周嘉琳; 曾诗棋; 李洪毅                                     科教导刊(中旬刊)   \n",
       "39                  韦懿桐                                          价值工程   \n",
       "40              郑丽萍; 赵杨                                         管理现代化   \n",
       "41        余思明; 唐建新; 孙辉东                                            预测   \n",
       "42              汶海; 李培功                                          审计研究   \n",
       "43                   辜雅                                          民国档案   \n",
       "44             张继德; 姜园园                                        财务管理研究   \n",
       "45             王晓茜; 徐志耀                                          江苏商论   \n",
       "46             黄佳琦; 宋夏云                                        财务管理研究   \n",
       "47                   王威                                      数学的实践与认识   \n",
       "48                  赵春艳                                  郑州航空工业管理学院学报   \n",
       "49       Hochuli Philip                              Health economics   \n",
       "\n",
       "                发表时间    被引       下载   操作  \n",
       "0   2021-05-11 09:35   NaN    270.0   下载  \n",
       "1         2021-05-10   NaN    180.0   下载  \n",
       "2         2021-04-25   NaN     29.0   下载  \n",
       "3         2021-04-15   NaN     84.0   下载  \n",
       "4         2021-03-26   NaN   3539.0   下载  \n",
       "5         2021-03-23   1.0    767.0   下载  \n",
       "6         2021-03-12   NaN   1198.0   下载  \n",
       "7         2021-02-28   NaN     58.0   下载  \n",
       "8         2021-02-20   NaN    766.0   下载  \n",
       "9         2021-02-20   NaN    211.0   下载  \n",
       "10        2021-02-20   NaN    119.0   下载  \n",
       "11        2021-02-15   NaN    416.0   下载  \n",
       "12        2021-02-07   1.0   1132.0   下载  \n",
       "13        2021-02-07   1.0   2164.0   下载  \n",
       "14        2021-01-28   NaN    608.0   下载  \n",
       "15        2021-01-27   NaN    435.0   下载  \n",
       "16        2021-01-25   NaN    110.0   下载  \n",
       "17        2021-01-15   NaN     81.0   下载  \n",
       "18        2021-01-15   NaN    729.0   下载  \n",
       "19        2021-01-08   NaN      NaN  NaN  \n",
       "20        2021-01-02   NaN      NaN  NaN  \n",
       "21        2020-12-15   NaN    695.0   下载  \n",
       "22        2020-12-01   NaN    310.0   下载  \n",
       "23        2020-11-20   NaN    228.0   下载  \n",
       "24        2020-11-20   1.0    870.0   下载  \n",
       "25        2020-10-15   NaN    295.0   下载  \n",
       "26        2020-10-15   NaN    249.0   下载  \n",
       "27        2020-10-01   NaN    482.0   下载  \n",
       "28  2020-09-28 11:19   2.0   3478.0   下载  \n",
       "29        2020-09-15   2.0    663.0   下载  \n",
       "30  2020-09-14 13:21   1.0    742.0   下载  \n",
       "31        2020-09-08   NaN    280.0   下载  \n",
       "32        2020-08-13   3.0    257.0   下载  \n",
       "33  2020-07-17 13:22   3.0    857.0   下载  \n",
       "34  2020-07-02 14:03   3.0    498.0   下载  \n",
       "35        2020-06-29   1.0    336.0   下载  \n",
       "36        2020-06-15   1.0    218.0   下载  \n",
       "37        2020-06-15   NaN    231.0   下载  \n",
       "38        2020-06-15   NaN      6.0   下载  \n",
       "39        2020-06-08   2.0    633.0   下载  \n",
       "40  2020-06-04 09:59  12.0  12566.0   下载  \n",
       "41  2020-05-28 13:36   4.0   1519.0   下载  \n",
       "42        2020-05-28   1.0    799.0   下载  \n",
       "43        2020-05-25   NaN    105.0   下载  \n",
       "44        2020-05-20   NaN    607.0   下载  \n",
       "45        2020-05-20   1.0    484.0   下载  \n",
       "46        2020-05-20  19.0  14348.0   下载  \n",
       "47        2020-05-08   NaN    333.0   下载  \n",
       "48        2020-04-26   1.0   1955.0   下载  \n",
       "49        2020-04-17   NaN      NaN  NaN  "
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 抓取首页（第一页）页面信息\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "含有页面主要数据的表格html_ = element.get_attribute('innerHTML')\n",
    "首页主要数据 = pd.read_html(含有页面主要数据的表格html_)[0]\n",
    "首页主要数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'下一页'"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 翻页\n",
    "element = driver.find_element_by_id('PageNext')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "表格_html = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "文章数量总计: 575\n",
      "结果页数总计: 1\n"
     ]
    }
   ],
   "source": [
    "import math\n",
    "import time\n",
    "from random import random\n",
    "import requests_html\n",
    "element = driver.find_element_by_xpath('//html')\n",
    "main_content =element.get_attribute('outerHTML')  \n",
    "html = requests_html.HTML(html= main_content, url='https://localhost/')\n",
    "文章数量 = html.xpath('//*[@id=\"countPageDiv\"]/span[1]/em')[0].lxml.text_content()\n",
    "print(\"文章数量总计:\",文章数量)\n",
    "结果页数 = html.xpath('//*[@id=\"gridTable\"]/div[2]/span[1]')[0].lxml.text_content()[1:2]\n",
    "print(\"结果页数总计:\",结果页数)\n",
    "pagenum = int(结果页数)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n"
     ]
    }
   ],
   "source": [
    "# 所有页数\n",
    "pages = list(range(1,12))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 函数（翻页）\n",
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        # 定位到“下一页”的按钮 ——> 点击\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        # 设定休息的时间 ——> 避免爬虫被禁报错、以及出现验证码\n",
    "        time.sleep(30+20*random())\n",
    "        # 获取含有页面主要数据的表格\n",
    "        element = driver.find_element_by_id('gridTable')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        表格_html[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t"
     ]
    }
   ],
   "source": [
    "# 翻页操作\n",
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "2   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "3   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "4   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "5   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "6   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "7   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "8   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "9   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "10  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "11  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df = pd.DataFrame([表格_html]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [],
   "source": [
    "网站 = \"中国知网\"\n",
    "# 指定内容输出的位置\n",
    "fn = { \"output\" : { \"htm_snippets\": \"data_raw_src/知网_htm_snippets_{网站}.tsv\"}\n",
    "     }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存页面内容的csv文件\n",
    "filename = fn [\"output\"] [\"htm_snippets\"] \n",
    "df.to_csv(filename.format(网站=网站), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [],
   "source": [
    "l_df = []\n",
    "for p in pages:\n",
    "    表格 = pd.read_html(表格_html[p])[0]\n",
    "    l_df.append(表格)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>财务欺诈风险特征筛选框架的建立和应用  网络首发</td>\n",
       "      <td>袁先智;周云鹏;严诚幸;刘海洋;钱国骐</td>\n",
       "      <td>中国管理科学</td>\n",
       "      <td>2021-05-11 09:35</td>\n",
       "      <td>NaN</td>\n",
       "      <td>270.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>中小企业会计舞弊成因及对策探讨</td>\n",
       "      <td>卢增金</td>\n",
       "      <td>中国商论</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>180.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>央企财务收支审计高风险领域及对策研究——基于审计署对35户央企2016年度财务审计结果公告的分析</td>\n",
       "      <td>王细韵</td>\n",
       "      <td>经济研究导刊</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>财务舞弊预防对策及建议探讨</td>\n",
       "      <td>陈利</td>\n",
       "      <td>会计师</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>84.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>基于舞弊三角理论的瑞幸咖啡财务造假案例研究</td>\n",
       "      <td>张潇丹</td>\n",
       "      <td>中国林业经济</td>\n",
       "      <td>2021-03-26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3539.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>520</th>\n",
       "      <td>571</td>\n",
       "      <td>Academic fraud: Prevalence, practices, and rea...</td>\n",
       "      <td>Hilbert Gail A.</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1987-01-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>521</th>\n",
       "      <td>572</td>\n",
       "      <td>Fraud and science.</td>\n",
       "      <td>Dworkin G</td>\n",
       "      <td>Progress in clinical and biological research</td>\n",
       "      <td>1983-07-12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>522</th>\n",
       "      <td>573</td>\n",
       "      <td>Scientific fraud. The system defends itself.</td>\n",
       "      <td>David P</td>\n",
       "      <td>Nature</td>\n",
       "      <td>1983-07-12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>523</th>\n",
       "      <td>574</td>\n",
       "      <td>Frauds and cheats.</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Journal. Mercer Dental Society, Trenton</td>\n",
       "      <td>1976-04-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>524</th>\n",
       "      <td>575</td>\n",
       "      <td>Pupil Cheating: Report on Dishonesty among 241...</td>\n",
       "      <td>Lyle H. Johnson</td>\n",
       "      <td>The Clearing House</td>\n",
       "      <td>1943-10-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>575 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                                 篇名  \\\n",
       "0             1                           财务欺诈风险特征筛选框架的建立和应用  网络首发   \n",
       "1             2                                    中小企业会计舞弊成因及对策探讨   \n",
       "2             3   央企财务收支审计高风险领域及对策研究——基于审计署对35户央企2016年度财务审计结果公告的分析   \n",
       "3             4                                      财务舞弊预防对策及建议探讨   \n",
       "4             5                              基于舞弊三角理论的瑞幸咖啡财务造假案例研究   \n",
       "..          ...                                                ...   \n",
       "520         571  Academic fraud: Prevalence, practices, and rea...   \n",
       "521         572                                 Fraud and science.   \n",
       "522         573       Scientific fraud. The system defends itself.   \n",
       "523         574                                 Frauds and cheats.   \n",
       "524         575  Pupil Cheating: Report on Dishonesty among 241...   \n",
       "\n",
       "                      作者                                            刊名  \\\n",
       "0    袁先智;周云鹏;严诚幸;刘海洋;钱国骐                                        中国管理科学   \n",
       "1                    卢增金                                          中国商论   \n",
       "2                    王细韵                                        经济研究导刊   \n",
       "3                     陈利                                           会计师   \n",
       "4                    张潇丹                                        中国林业经济   \n",
       "..                   ...                                           ...   \n",
       "520      Hilbert Gail A.                                           NaN   \n",
       "521            Dworkin G  Progress in clinical and biological research   \n",
       "522              David P                                        Nature   \n",
       "523                  NaN       Journal. Mercer Dental Society, Trenton   \n",
       "524      Lyle H. Johnson                            The Clearing House   \n",
       "\n",
       "                 发表时间  被引      下载   操作  \n",
       "0    2021-05-11 09:35 NaN   270.0   下载  \n",
       "1          2021-05-10 NaN   180.0   下载  \n",
       "2          2021-04-25 NaN    29.0   下载  \n",
       "3          2021-04-15 NaN    84.0   下载  \n",
       "4          2021-03-26 NaN  3539.0   下载  \n",
       "..                ...  ..     ...  ...  \n",
       "520        1987-01-01 NaN     NaN  NaN  \n",
       "521        1983-07-12 NaN     NaN  NaN  \n",
       "522        1983-07-12 NaN     NaN  NaN  \n",
       "523        1976-04-10 NaN     NaN  NaN  \n",
       "524        1943-10-01 NaN     NaN  NaN  \n",
       "\n",
       "[575 rows x 8 columns]"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_总表 = 首页主要数据.append(df_url_out)\n",
    "df_总表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>财务欺诈风险特征筛选框架的建立和应用  网络首发</td>\n",
       "      <td>袁先智;周云鹏;严诚幸;刘海洋;钱国骐</td>\n",
       "      <td>中国管理科学</td>\n",
       "      <td>2021-05-11 09:35</td>\n",
       "      <td>NaN</td>\n",
       "      <td>270.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>中小企业会计舞弊成因及对策探讨</td>\n",
       "      <td>卢增金</td>\n",
       "      <td>中国商论</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>180.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>央企财务收支审计高风险领域及对策研究——基于审计署对35户央企2016年度财务审计结果公告的分析</td>\n",
       "      <td>王细韵</td>\n",
       "      <td>经济研究导刊</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>财务舞弊预防对策及建议探讨</td>\n",
       "      <td>陈利</td>\n",
       "      <td>会计师</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>84.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>基于舞弊三角理论的瑞幸咖啡财务造假案例研究</td>\n",
       "      <td>张潇丹</td>\n",
       "      <td>中国林业经济</td>\n",
       "      <td>2021-03-26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3539.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>520</th>\n",
       "      <td>571</td>\n",
       "      <td>Academic fraud: Prevalence, practices, and rea...</td>\n",
       "      <td>Hilbert Gail A.</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1987-01-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>521</th>\n",
       "      <td>572</td>\n",
       "      <td>Fraud and science.</td>\n",
       "      <td>Dworkin G</td>\n",
       "      <td>Progress in clinical and biological research</td>\n",
       "      <td>1983-07-12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>522</th>\n",
       "      <td>573</td>\n",
       "      <td>Scientific fraud. The system defends itself.</td>\n",
       "      <td>David P</td>\n",
       "      <td>Nature</td>\n",
       "      <td>1983-07-12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>523</th>\n",
       "      <td>574</td>\n",
       "      <td>Frauds and cheats.</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Journal. Mercer Dental Society, Trenton</td>\n",
       "      <td>1976-04-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>524</th>\n",
       "      <td>575</td>\n",
       "      <td>Pupil Cheating: Report on Dishonesty among 241...</td>\n",
       "      <td>Lyle H. Johnson</td>\n",
       "      <td>The Clearing House</td>\n",
       "      <td>1943-10-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>575 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                                 篇名  \\\n",
       "0             1                           财务欺诈风险特征筛选框架的建立和应用  网络首发   \n",
       "1             2                                    中小企业会计舞弊成因及对策探讨   \n",
       "2             3   央企财务收支审计高风险领域及对策研究——基于审计署对35户央企2016年度财务审计结果公告的分析   \n",
       "3             4                                      财务舞弊预防对策及建议探讨   \n",
       "4             5                              基于舞弊三角理论的瑞幸咖啡财务造假案例研究   \n",
       "..          ...                                                ...   \n",
       "520         571  Academic fraud: Prevalence, practices, and rea...   \n",
       "521         572                                 Fraud and science.   \n",
       "522         573       Scientific fraud. The system defends itself.   \n",
       "523         574                                 Frauds and cheats.   \n",
       "524         575  Pupil Cheating: Report on Dishonesty among 241...   \n",
       "\n",
       "                      作者                                            刊名  \\\n",
       "0    袁先智;周云鹏;严诚幸;刘海洋;钱国骐                                        中国管理科学   \n",
       "1                    卢增金                                          中国商论   \n",
       "2                    王细韵                                        经济研究导刊   \n",
       "3                     陈利                                           会计师   \n",
       "4                    张潇丹                                        中国林业经济   \n",
       "..                   ...                                           ...   \n",
       "520      Hilbert Gail A.                                           NaN   \n",
       "521            Dworkin G  Progress in clinical and biological research   \n",
       "522              David P                                        Nature   \n",
       "523                  NaN       Journal. Mercer Dental Society, Trenton   \n",
       "524      Lyle H. Johnson                            The Clearing House   \n",
       "\n",
       "                 发表时间  被引      下载   操作  \n",
       "0    2021-05-11 09:35 NaN   270.0   下载  \n",
       "1          2021-05-10 NaN   180.0   下载  \n",
       "2          2021-04-25 NaN    29.0   下载  \n",
       "3          2021-04-15 NaN    84.0   下载  \n",
       "4          2021-03-26 NaN  3539.0   下载  \n",
       "..                ...  ..     ...  ...  \n",
       "520        1987-01-01 NaN     NaN  NaN  \n",
       "521        1983-07-12 NaN     NaN  NaN  \n",
       "522        1983-07-12 NaN     NaN  NaN  \n",
       "523        1976-04-10 NaN     NaN  NaN  \n",
       "524        1943-10-01 NaN     NaN  NaN  \n",
       "\n",
       "[575 rows x 8 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 将内容表格存在本地\n",
    "with pd.ExcelWriter('Selenium知网数据.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_总表.to_excel(writer,sheet_name=\"知网数据\")\n",
    "display(df_总表)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 尝试导出refworks文件（.txt） 和 下载原文\n",
    "# 经历过翻页以后 回去首页\n",
    "element = driver.find_element_by_id('total').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'下一页'"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 翻页\n",
    "element = driver.find_element_by_id('PageNext')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "导出_html = dict()\n",
    "main_content_ =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n"
     ]
    }
   ],
   "source": [
    "# 导出refworks文件（.txt）和下载文章 —— 页面选中（50）篇 —— 翻页在选中 （此操作一直循环） ！！注意：每次全选不能超过500篇，故此次爬取905篇文章，分2次进行\n",
    "pages = list(range(1,11))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 选中页面50篇 —> 翻页\n",
    "def process_xuanzhong (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        全选 = driver.find_element_by_id('selectCheckAll1')\n",
    "        全选.click()\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        time.sleep(30+20*random())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t"
     ]
    }
   ],
   "source": [
    "process_xuanzhong (pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出与分析 \n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-d\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出文献\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-r\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击Refworks\n",
    "element = driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-63B2B96DA7167A1773DB538806D44ED6',\n",
       " 'CDwindow-621DC4615BF2C904A2945859EF503199',\n",
       " 'CDwindow-FA98B5485E5C23254755FE3B9F0573BC']"
      ]
     },
     "execution_count": 162,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 所有窗口ID\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-163-520070efe65b>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出 .txt文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出 .txt文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-166-0188c2a7ff70>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 批量下载\n",
    "element = driver.find_element_by_xpath('//li[@class=\"bulkdownload export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-190-1f3bb34cc9cb>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[3])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 下载所选文献（500篇）\n",
    "element = driver.find_element_by_id('btn-download-all').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-192-8b82f1e63dce>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换 ——> 返回论文列表页\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 清除所选的500篇文献\n",
    "element = driver.find_element_by_xpath('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[10, 11, 12, 13, 14, 15, 16, 17, 18]\n"
     ]
    }
   ],
   "source": [
    "pages = list(range(10,19))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 选中页面50篇 —> 翻页\n",
    "def process_xuanzhong (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        全选 = driver.find_element_by_id('selectCheckAll1')\n",
    "        全选.click()\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        time.sleep(30+20*random())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\t"
     ]
    },
    {
     "ename": "NoSuchElementException",
     "evalue": "Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"[id=\"PageNext\"]\"}\n  (Session info: chrome=91.0.4472.77)\n",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNoSuchElementException\u001b[0m                    Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-197-ee7190366ebf>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mprocess_xuanzhong\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mpages\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-195-426deddc0456>\u001b[0m in \u001b[0;36mprocess_xuanzhong\u001b[1;34m(pages)\u001b[0m\n\u001b[0;32m      5\u001b[0m         \u001b[0m全选\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'selectCheckAll1'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m         \u001b[0m全选\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclick\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m         \u001b[0m跳转\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'PageNext'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      8\u001b[0m         \u001b[0m跳转\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclick\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      9\u001b[0m         \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m30\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;36m20\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mrandom\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mfind_element_by_id\u001b[1;34m(self, id_)\u001b[0m\n\u001b[0;32m    358\u001b[0m             \u001b[0melement\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'foo'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    359\u001b[0m         \"\"\"\n\u001b[1;32m--> 360\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mby\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mID\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    361\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    362\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mfind_elements_by_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mid_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mfind_element\u001b[1;34m(self, by, value)\u001b[0m\n\u001b[0;32m    974\u001b[0m                 \u001b[0mby\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mCSS_SELECTOR\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    975\u001b[0m                 \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'[name=\"%s\"]'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 976\u001b[1;33m         return self.execute(Command.FIND_ELEMENT, {\n\u001b[0m\u001b[0;32m    977\u001b[0m             \u001b[1;34m'using'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    978\u001b[0m             'value': value})['value']\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m    319\u001b[0m         \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    320\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    322\u001b[0m             response['value'] = self._unwrap_value(\n\u001b[0;32m    323\u001b[0m                 response.get('value', None))\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m    240\u001b[0m                 \u001b[0malert_text\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'alert'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    241\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0malert_text\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 242\u001b[1;33m         \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    243\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    244\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_value_or_default\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNoSuchElementException\u001b[0m: Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"[id=\"PageNext\"]\"}\n  (Session info: chrome=91.0.4472.77)\n"
     ]
    }
   ],
   "source": [
    "process_xuanzhong (pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
